PanMyeloid Assignment–Data Exploration

Data Exploration Notebook


1. Environment Setting

# Single-cell required
library(Seurat)
library(SeuratData)
library(SeuratDisk)
library(SeuratWrappers)

# For data analysis
library(rlang)
library(dplyr)
library(tidyr)
library(stringr)

# For plotting
library(ggplot2)
library(cowplot)
library(ggridges)
library(ggsci)
packageVersion("Seurat")
## [1] '4.0.4'

2. Import and Examine Data

hfile <- Connect("../processedData/alldata.obj/regressed-alldata.h5seurat")
hfile$index()
## Data for assay RNA★ (default assay)
##    counts      data    scale.data
##      ✔          ✔          ✔
# Convert("../processedData/alldata.obj/regressed-alldata.h5ad", dest = "h5seurat", overwrite = TRUE)
alldata <- LoadH5Seurat("../processedData/alldata.obj/regressed-alldata.h5seurat",meta.data=F)
alldata.meta <- read.csv("../processedData/alldata.obj/alldata-metadata.csv",header = T, row.names = 1)
alldata@meta.data <- alldata.meta
alldata
## An object of class Seurat 
## 10390 features across 45251 samples within 1 assay 
## Active assay: RNA (10390 features, 0 variable features)
head(alldata@meta.data)
alldata@meta.data %>% 
    ggplot(aes(x=patient, fill=patient)) + 
    geom_bar() +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold"),legend.position="none") +
    ggtitle("NCells")

* Cell number check

cell_tech <- alldata@meta.data %>% 
  ggplot(aes(fill=tech_10X, x=tech_10X)) + 
  geom_bar(alpha = 0.75) + ggtitle("NCells") +
  theme_classic() +
  scale_fill_npg()
cell_cancer <- alldata@meta.data %>% 
  ggplot(aes(fill=cancer, x=cancer)) + 
  geom_bar(alpha = 0.75) + ggtitle("NCells") +
  theme_classic() +
  scale_fill_simpsons()
plot_grid(cell_tech,cell_cancer,rel_widths = c(1,2))

  • UMI counts (transcripts) per cell
count_tech <- alldata@meta.data %>% 
  ggplot(aes(fill=tech_10X, x=n_counts)) + ggtitle("NCounts") +
  geom_density(alpha = 0.3) + 
  scale_x_log10() + 
  theme_classic() +
  scale_fill_npg()
count_cancer <- alldata@meta.data %>% 
  ggplot(aes(fill=cancer, x=n_counts)) + ggtitle("NCounts") +
  geom_density(alpha = 0.3) + 
  scale_x_log10() + 
  theme_classic() +
  scale_fill_simpsons()
plot_grid(count_tech,count_cancer,rel_widths = c(2,2))

3. Dimensionality Reduction

alldata <- FindVariableFeatures(alldata,selection.method = "vst",nfeatures = 2000)
alldata <- RunPCA(alldata, npcs = 100, verbose = FALSE)
ElbowPlot(alldata,ndims = 100)

# Set dim
pc.num=1:100
# Run UMAP
alldata <- RunUMAP(alldata, dims = pc.num, reduction = "pca")
alldata <- RunTSNE(alldata, dims = pc.num)
p1 <- DimPlot(alldata, group.by = "patient", reduction = "umap") + NoLegend()
p2 <- DimPlot(alldata, group.by = "patient", reduction = "tsne") + NoLegend()
plot_grid(p1,p2,rel_widths = c(2,2))

p3 <- DimPlot(alldata, group.by = "tech_10X", reduction = "umap") 
p4 <- DimPlot(alldata, group.by = "tech_10X", reduction = "tsne")
plot_grid(p3,p4,rel_widths = c(2,2))

p5 <- DimPlot(alldata, group.by = "tech", reduction = "umap") 
p6 <- DimPlot(alldata, group.by = "tech", reduction = "tsne")
plot_grid(p5,p6,rel_widths = c(2,2))

p7 <- DimPlot(alldata, group.by = "MajorCluster", reduction = "umap") 
p8 <- DimPlot(alldata, group.by = "MajorCluster", reduction = "tsne")
plot_grid(p7,p8,rel_widths = c(2,2))

4. Harmony Integration

library(harmony)
library(Rcpp)
library(rlang)
dat.h <- alldata
integrated.by.both.h <- harmony::RunHarmony(dat.h, group.by.vars =c("tech_10X","patient"), assay.use = "RNA", plot_convergence = TRUE)

# Run UMAP
integrated.by.both.h <- RunUMAP(integrated.by.both.h, dims = 1:100, reduction = "harmony")
DimPlot(integrated.by.both.h, group.by = c("tech_10X", "MajorCluster"), ncol = 2,reduction = "umap")

p_cls <- DimPlot(object = integrated.by.both.h, group.by = "MajorCluster", reduction = "umap")
p_patient <- DimPlot(object = integrated.by.both.h, group.by = "patient", reduction = "umap") +NoLegend()
plot_grid(p_patient,p_cls,rel_widths = c(2,2))

5. LIGER Integration

library(rliger)
dat.l <- alldata
dat.l <- ScaleData(dat.l, split.by = "tech_10X", do.center = FALSE)
dat.l <- RunOptimizeALS(dat.l, k = 20, lambda = 5, split.by = "tech_10X",)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## Finished in 7.739579 mins, 30 iterations.
## Max iterations set: 30.
## Final objective delta: 1.651939e-05.
## Best results with seed 1.
dat.l <- RunQuantileNorm(dat.l, split.by = "tech_10X")
# Dimensional reduction and plotting
dat.l <- RunUMAP(dat.l, dims = 1:ncol(dat.l[["iNMF"]]), reduction = "iNMF")
DimPlot(dat.l, group.by = c("tech_10X", "MajorCluster"), ncol = 2)

6. Seurat3 Integration


Posing error: caught segfault, address (nil), cause ‘memory not mapped’

Issue#4628 exists on GitHub with bug label but no responses.

Maybe due to insufficient menory? Anyway, just show the codes below.


dat.s <- alldata
split_seurat <- SplitObject(dat.s, split.by = "patient")

# Normalize and identify variable features for each dataset independently
split_seurat <- lapply(X = split_seurat, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# select features that are repeatedly variable across datasets for integration
integ_features <- SelectIntegrationFeatures(object.list = split_seurat)

Perform CCA, find the best buddies or anchors and filter incorrect anchors.

# Find best buddies - can take a while to run
integ_anchors <- FindIntegrationAnchors(object.list = split_seurat, 
                                        normalization.method = "LogNormalize", 
                                        anchor.features = integ_features,
                                        dims = 1:20, reduction = "rpca")

Integrate across conditions.

# Integrate across conditions
integrated.s <- IntegrateData(anchorset = integ_anchors, normalization.method = "LogNormalize")

7. LISI Evaluation

# devtools::install_github("immunogenomics/lisi")
library(lisi)
library(Rcpp)

7.1 Check Before Integration

umap.co <- Embeddings(object =alldata[["umap"]])
meta.df <- data.frame(patient = alldata$patient, tech = alldata$tech, tech_10X = alldata$tech_10X)
umap.res <- compute_lisi(umap.co, meta.df, c("patient", "tech", "tech_10X"))
# save(umap.res,tsne.res,file = "../processedData/tmp.data/ori-lisi-res.RData")
summary(umap.res)
##     patient            tech          tech_10X    
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 1.294   1st Qu.:1.000   1st Qu.:1.000  
##  Median : 2.564   Median :1.000   Median :1.025  
##  Mean   : 3.264   Mean   :1.006   Mean   :1.170  
##  3rd Qu.: 4.709   3rd Qu.:1.000   3rd Qu.:1.228  
##  Max.   :13.282   Max.   :2.000   Max.   :2.607

Each row in the output data frame corresponds to a cell from X. The score (e.g. max 13.282) indicates the effective number of different categories represented in the local neighborhood of each cell. If the cells are well-mixed, then we might expect the LISI score to be near 46 for a categorical variable with 46 categories(e.g. “patient”).

vis_ori <- rbind(data.frame(Score = umap.res$patient, Batch = "Patient", Method = "None"),
                data.frame(Score = umap.res$tech, Batch = "Tech", Method = "None"),
                data.frame(Score = umap.res$tech_10X, Batch = "Tech_10X", Method = "None"))

7.2 Harmony Output

h.both.umap.co <- Embeddings(object =integrated.by.both.h[["umap"]])
h.both.umap.res <- compute_lisi(h.both.umap.co, meta.df, c("patient", "tech", "tech_10X"))
summary(h.both.umap.res)
##     patient            tech          tech_10X    
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 2.976   1st Qu.:1.000   1st Qu.:1.145  
##  Median : 5.123   Median :1.002   Median :1.463  
##  Mean   : 5.414   Mean   :1.054   Mean   :1.520  
##  3rd Qu.: 7.510   3rd Qu.:1.039   3rd Qu.:1.851  
##  Max.   :17.281   Max.   :2.000   Max.   :3.000
vis_h <- rbind(data.frame(Score = h.both.umap.res$patient, Batch = "Patient", Method = "Harmony"),
                data.frame(Score = h.both.umap.res$tech, Batch = "Tech", Method = "Harmony"),
                data.frame(Score = h.both.umap.res$tech_10X, Batch = "Tech_10X", Method = "Harmony"))

7.3 LIGER Output

l.both.umap.co <- Embeddings(object =dat.l[["umap"]])
l.both.umap.res <- compute_lisi(l.both.umap.co, meta.df, c("patient", "tech", "tech_10X"))
summary(l.both.umap.res)
##     patient            tech          tech_10X    
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 2.418   1st Qu.:1.000   1st Qu.:1.145  
##  Median : 4.442   Median :1.006   Median :1.448  
##  Mean   : 4.804   Mean   :1.060   Mean   :1.500  
##  3rd Qu.: 6.832   3rd Qu.:1.064   3rd Qu.:1.812  
##  Max.   :17.347   Max.   :2.000   Max.   :2.992
vis_l <- rbind(data.frame(Score = l.both.umap.res$patient, Batch = "Patient", Method = "LIGER"),
                data.frame(Score = l.both.umap.res$tech, Batch = "Tech", Method = "LIGER"),
                data.frame(Score = l.both.umap.res$tech_10X, Batch = "Tech_10X", Method = "LIGER"))

7.4 Scanorama Output

# Convert("../processedData/alldata.obj/finish-integration-with-umap.h5ad", dest = "h5seurat", overwrite = TRUE)
scanorama <- LoadH5Seurat("../processedData/alldata.obj/finish-integration-with-umap.h5seurat",meta.data=F,reductions = "umap")
scanorama.meta <- read.csv("../processedData/alldata.obj/alldata-metadata.csv",header = T, row.names = 1)
scanorama@meta.data <- scanorama.meta
scanorama
## An object of class Seurat 
## 10390 features across 45251 samples within 1 assay 
## Active assay: RNA (10390 features, 0 variable features)
##  1 dimensional reduction calculated: umap
scanorama.umap.co <- Embeddings(object =scanorama[["umap"]])
scanorama.umap.res <- compute_lisi(scanorama.umap.co, meta.df, c("patient", "tech", "tech_10X"))
summary(scanorama.umap.res)
##     patient            tech          tech_10X    
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 1.640   1st Qu.:1.000   1st Qu.:1.000  
##  Median : 2.977   Median :1.000   Median :1.004  
##  Mean   : 3.864   Mean   :1.054   Mean   :1.169  
##  3rd Qu.: 5.692   3rd Qu.:1.000   3rd Qu.:1.161  
##  Max.   :15.565   Max.   :2.000   Max.   :2.971
vis_sc <- rbind(data.frame(Score = scanorama.umap.res$patient, Batch = "Patient", Method = "Scanorama"),
                data.frame(Score = scanorama.umap.res$tech, Batch = "Tech", Method = "Scanorama"),
                data.frame(Score = scanorama.umap.res$tech_10X, Batch = "Tech_10X", Method = "Scanorama"))

7.5 scGen Output

scgen <- read.csv("../processedData/alldata.obj/scgen-data-umap.csv")
scgen.umap.res <- compute_lisi(scanorama.umap.co, meta.df, c("patient", "tech", "tech_10X"))
summary(scgen.umap.res)
##     patient            tech          tech_10X    
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 1.640   1st Qu.:1.000   1st Qu.:1.000  
##  Median : 2.977   Median :1.000   Median :1.004  
##  Mean   : 3.864   Mean   :1.054   Mean   :1.169  
##  3rd Qu.: 5.692   3rd Qu.:1.000   3rd Qu.:1.161  
##  Max.   :15.565   Max.   :2.000   Max.   :2.971
vis_scgen <- rbind(data.frame(Score = scgen.umap.res$patient, Batch = "Patient", Method = "scGen"),
                data.frame(Score = scgen.umap.res$tech, Batch = "Tech", Method = "scGen"),
                data.frame(Score = scgen.umap.res$tech_10X, Batch = "Tech_10X", Method = "scGen"))

7.6 Visualization

vis_all <- rbind(vis_ori,vis_h,vis_l,vis_sc,vis_scgen)
ggplot(vis_all, aes(x=Method, y=Score, fill=Method)) + 
    geom_violin(position="dodge") + 
    geom_boxplot(width=0.1, color="gray30", alpha=0.3, outlier.shape = 21,outlier.colour="transparent") +
    facet_wrap(~Batch, scale="free") +
    xlab("") + ylab("log10(Score)") + 
    scale_fill_simpsons(alpha = 0.8) + scale_y_log10()

8. Supplemental Information

▶ Session Info
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=zh_CN.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=zh_CN.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=zh_CN.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=zh_CN.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lisi_1.0                  rliger_1.0.0             
##  [3] patchwork_1.1.1           Matrix_1.3-4             
##  [5] harmony_0.1.0             Rcpp_1.0.7               
##  [7] ggsci_2.9                 ggridges_0.5.3           
##  [9] cowplot_1.1.1             ggplot2_3.3.5            
## [11] stringr_1.4.0             tidyr_1.1.4              
## [13] dplyr_1.0.7               rlang_0.4.11             
## [15] SeuratWrappers_0.3.0      SeuratDisk_0.0.0.9019    
## [17] stxBrain.SeuratData_0.1.1 SeuratData_0.2.1         
## [19] SeuratObject_4.0.2        Seurat_4.0.4             
## 
## loaded via a namespace (and not attached):
##   [1] plyr_1.8.6            igraph_1.2.6          lazyeval_0.2.2       
##   [4] splines_4.1.1         listenv_0.8.0         scattermore_0.7      
##   [7] digest_0.6.28         foreach_1.5.1         htmltools_0.5.2      
##  [10] fansi_0.5.0           magrittr_2.0.1        tensor_1.5           
##  [13] cluster_2.1.2         doParallel_1.0.16     ROCR_1.0-11          
##  [16] remotes_2.4.1         globals_0.14.0        matrixStats_0.61.0   
##  [19] R.utils_2.11.0        spatstat.sparse_2.0-0 rmdformats_1.0.2     
##  [22] colorspace_2.0-2      rappdirs_0.3.3        ggrepel_0.9.1        
##  [25] xfun_0.26             crayon_1.4.1          jsonlite_1.7.2       
##  [28] spatstat.data_2.1-0   iterators_1.0.13      survival_3.2-13      
##  [31] zoo_1.8-9             glue_1.4.2            polyclip_1.10-0      
##  [34] gtable_0.3.0          leiden_0.3.9          future.apply_1.8.1   
##  [37] abind_1.4-5           scales_1.1.1          DBI_1.1.1            
##  [40] miniUI_0.1.1.1        riverplot_0.10        viridisLite_0.4.0    
##  [43] xtable_1.8-4          reticulate_1.22       spatstat.core_2.3-0  
##  [46] mclust_5.4.7          bit_4.0.4             rsvd_1.0.5           
##  [49] htmlwidgets_1.5.4     httr_1.4.2            FNN_1.1.3            
##  [52] RColorBrewer_1.1-2    ellipsis_0.3.2        ica_1.0-2            
##  [55] pkgconfig_2.0.3       R.methodsS3_1.8.1     farver_2.1.0         
##  [58] sass_0.4.0            uwot_0.1.10           deldir_0.2-10        
##  [61] utf8_1.2.2            tidyselect_1.1.1      labeling_0.4.2       
##  [64] reshape2_1.4.4        later_1.3.0           munsell_0.5.0        
##  [67] tools_4.1.1           cli_3.0.1             generics_0.1.0       
##  [70] evaluate_0.14         fastmap_1.1.0         yaml_2.2.1           
##  [73] goftest_1.2-2         knitr_1.36            bit64_4.0.5          
##  [76] fitdistrplus_1.1-6    purrr_0.3.4           RANN_2.6.1           
##  [79] pbapply_1.5-0         future_1.22.1         nlme_3.1-153         
##  [82] mime_0.12             R.oo_1.24.0           hdf5r_1.3.4          
##  [85] compiler_4.1.1        plotly_4.9.4.1        png_0.1-7            
##  [88] spatstat.utils_2.2-0  tibble_3.1.4          bslib_0.3.0          
##  [91] stringi_1.7.4         highr_0.9             RSpectra_0.16-0      
##  [94] lattice_0.20-45       vctrs_0.3.8           pillar_1.6.3         
##  [97] lifecycle_1.0.1       BiocManager_1.30.15   spatstat.geom_2.2-2  
## [100] lmtest_0.9-38         jquerylib_0.1.4       RcppAnnoy_0.0.19     
## [103] data.table_1.14.2     irlba_2.3.3           httpuv_1.6.3         
## [106] R6_2.5.1              bookdown_0.24         promises_1.2.0.1     
## [109] KernSmooth_2.23-20    gridExtra_2.3         parallelly_1.28.1    
## [112] codetools_0.2-18      MASS_7.3-54           assertthat_0.2.1     
## [115] withr_2.4.2           sctransform_0.3.2     mgcv_1.8-37          
## [118] parallel_4.1.1        grid_4.1.1            rpart_4.1-15         
## [121] rmarkdown_2.11        Rtsne_0.15            shiny_1.7.1
▶ Server Info
  • Linux Version
## NAME="Ubuntu"
## VERSION="20.04.3 LTS (Focal Fossa)"
## ID=ubuntu
## ID_LIKE=debian
## PRETTY_NAME="Ubuntu 20.04.3 LTS"
## VERSION_ID="20.04"
## HOME_URL="https://www.ubuntu.com/"
## SUPPORT_URL="https://help.ubuntu.com/"
## BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
## PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
## VERSION_CODENAME=focal
## UBUNTU_CODENAME=focal
  • CPU info
## Architecture:                    x86_64
## CPU op-mode(s):                  32-bit, 64-bit
## Byte Order:                      Little Endian
## Address sizes:                   46 bits physical, 48 bits virtual
## CPU(s):                          96
## On-line CPU(s) list:             0-95
## Thread(s) per core:              2
## Core(s) per socket:              24
## Socket(s):                       2
## NUMA node(s):                    2
## Vendor ID:                       GenuineIntel
## CPU family:                      6
## Model:                           85
## Model name:                      Intel(R) Xeon(R) Platinum 8259L CPU @ 2.50GHz
## Stepping:                        6
## CPU MHz:                         2500.000
## CPU max MHz:                     3500.0000
## CPU min MHz:                     1200.0000
## BogoMIPS:                        5000.00
## Virtualization:                  VT-x
## L1d cache:                       1.5 MiB
## L1i cache:                       1.5 MiB
## L2 cache:                        48 MiB
## L3 cache:                        66 MiB
## NUMA node0 CPU(s):               0-23,48-71